# -*- coding: utf-8 -*-
import pymongo
from datetime import datetime
import re

import scrapy
from lxml import etree
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from SinaWeiboSpider.items import CommentItem
from SinaWeiboSpider.spiders.utils import extract_comment_content, time_fix
from SinaWeiboSpider.settings import MONGO_URI, DB_NAME


def get_urls():
    """Return the set of weibo URLs that still need their comments crawled.

    Reads the full URL list from the fixture file and subtracts every
    ``weibo_url`` already present in the ``comment`` MongoDB collection.

    Returns:
        set[str]: URLs not yet downloaded.
    """
    # Close the Mongo connection deterministically instead of leaking it.
    client = pymongo.MongoClient(MONGO_URI)
    try:
        weibo_urls = client[DB_NAME]['comment'].distinct('weibo_url')
    finally:
        client.close()
    # Path is relative to the project root (where the spider is launched from).
    with open('SinaWeiboSpider/spiders/fangfang_weibo_urls.txt', encoding='utf-8') as f:
        total_urls = {url.strip() for url in f}
    start_urls = total_urls - set(weibo_urls)
    print(f'共{len(total_urls)} 已下载{len(weibo_urls)} 还需下载{len(start_urls)}')
    return start_urls


class SinaWeiboCommentSpider(scrapy.Spider):
    """Crawl Sina Weibo (weibo.cn mobile site) comment pages and yield CommentItem records."""

    name = 'sina_weibo_comment'
    allowed_domains = ['weibo.cn']
    base_url = 'http://weibo.cn/'

    def start_requests(self):
        """Yield one request per weibo URL that has no comments stored yet.

        The original weibo URL is carried in ``meta`` so each item can be
        linked back to its post.
        """
        for weibo_url in get_urls():
            yield Request(url=weibo_url, callback=self.parse_comment,
                          meta={'weibo_url': weibo_url}, dont_filter=True)

    def parse_comment(self, response):
        """Parse one comment-listing page and yield a CommentItem per comment.

        Comment entries are ``<div class="c" id="C_...">`` nodes. Entries
        missing a user link, timestamp, or like-count are skipped instead of
        aborting the whole page.
        """
        tree_node = etree.HTML(response.body)
        comment_nodes = tree_node.xpath('//div[@class="c" and contains(@id,"C_")]')
        for comment_node in comment_nodes:
            comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href')
            if not comment_user_url:
                continue
            # Guard every lookup: malformed nodes previously raised
            # IndexError/AttributeError and killed the remaining comments.
            user_id_match = re.search(r'/u/(\d+)', comment_user_url[0])
            if not user_id_match:
                continue
            created_at_texts = comment_node.xpath('.//span[@class="ct"]/text()')
            like_texts = comment_node.xpath('.//a[contains(text(),"赞[")]/text()')
            if not created_at_texts or not like_texts:
                continue
            like_match = re.search(r'\d+', like_texts[-1])  # raw string: '\d' is an invalid escape otherwise
            comment_item = CommentItem()
            comment_item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %X')
            comment_item['weibo_url'] = response.meta['weibo_url']
            comment_item['comment_user_id'] = user_id_match.group(1)
            comment_item['content'] = extract_comment_content(etree.tostring(comment_node, encoding='unicode'))
            comment_item['_id'] = comment_node.xpath('./@id')[0]
            comment_item['like_num'] = int(like_match.group()) if like_match else 0
            # timestamp text is "<time>\xa0<source>"; keep only the time part
            comment_item['created_at'] = time_fix(created_at_texts[0].split('\xa0')[0])
            yield comment_item


if __name__ == '__main__':
    # Run the comment spider standalone, using the project's Scrapy settings.
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl('sina_weibo_comment')
    crawler_process.start()